*********** User input: define parameters of the analytic sample *******

	cap program drop keep_analytic_sample
	program keep_analytic_sample	
		* Only physicians in NPPES
		keep if physician_nppes==1
		* Keep only years 2005-2017
		keep if year>2004 
		* Keep only individual-year observations with a CZ code
		keep if !missing(cz90)
		* Keep only individual-year observations for age 20 to 70
		keep if inrange(age,20,70)
	end	
	
************************************************************************
 
	use "${build_data}/panel_physicians", clear
	
	local npiactions provider_enumeration last_update npi_deactivation npi_reactivation 
	foreach npiaction of local npiactions {
	gen `npiaction'_datenum=date(`npiaction'_date, "MDY")
	gen `npiaction'_year=year(`npiaction'_datenum)
	drop `npiaction'_date `npiaction'_datenum
	}
	rename last_update_year last_npi_update_year
	
	destring npi, replace	
	destring personid, gen(id) 


	***
	***
	
    * Demographics 

	gen age = year-birthyr
	drop if missing(age) 
	drop if deathyr<=year // not including the year of death
	drop if birthyr>year  // drops individuals with missing birthyr	
	
	gen female=(gender=="F")
	drop gender
	gen married=(filing_status==2 | filing_status==3) if !missing(filing_status)
	gen spouse_in_labor_force=(!missing(count_w2s_spouse)) if married==1 & year>2004
	fmerge m:1 personid_spouse using "${intermediate_data}/spouse_physician/npi_spouse.dta", keep(master match) nogen
	gen spouse_doc=(!missing(npi_spouse)) if married==1

	
	***
	***

	* Geography 

	rename (stfips county zcta) (m_statecode m_cnty m_zip_area)

	local t "informational file" 
	local m "individual file"
	
	gen stfips = m_statecode if mvar=="`t'"
	replace stfips = substr(cfips_from_zip,1,2) if stfips=="" & zvar=="`t'"
	replace stfips = m_statecode if stfips=="" & mvar=="`m'"
	replace stfips = substr(cfips_from_zip,1,2) if stfips=="" & zvar=="`m'"
	replace stfips = m_statecode if stfips=="" & !inlist(mvar,"`t'","`m'")
	replace stfips = "0"+stfips if strlen(stfips) == 1

	gen cfips = m_statecode+m_cnty if mvar=="`t'"
	replace cfips = cfips_from_zip if cfips=="" & zvar=="`t'"
	replace cfips = m_statecode+m_cnty if cfips=="" & mvar=="`m'"
	replace cfips = cfips_from_zip if cfips=="" & zvar=="`m'"
	replace cfips = m_statecode+m_cnty if cfips=="" & !inlist(mvar,"`t'","`m'")
	replace cfips = "0" + cfips if strlen(cfips) == 4

	tostring(zipfive), replace
	replace zipfive="0"+zipfive if length(zipfive)==4
	replace zipfive="00"+zipfive if length(zipfive)==3
	replace zipfive="" if length(zipfive)!=5
	
	merge m:1 zipfive using "${intermediate_data}/crosswalks/zip_to_cfips_xwalk.dta", keep(master match) nogen
	
	replace cfips = zcfips if mi(cfips) & !missing(zipfive)
	replace stfips = substr(zcfips, 1, 2) if mi(stfips) & !missing(zipfive)
		
	merge m:1 cfips using "${intermediate_data}/crosswalks/cfips_cz_state_xwalk.dta", keep(master match) nogen 
	drop cz80 
	
	destring cz90, replace
	

	***
	***

	* Income
	* All dollar-denominated variables onverted to 2017 dollars
	* See Appendix B.2 for explanation of income measures
	
	fmerge m:1 year using  "${raw_data}/cps/cpi99.dta", assert(match using) keep(match) nogen 
	
	foreach var in aginc w2wgs_salary amount_tax_dividends amount_tax_interest social_security schde_profit_loss ///
			amount_tax_exempt_inc total_money_income tax_distr_1 tax_distr_2 w2wgs total_compensation ///
			w2wgs_spouse total_compensation_spouse {
	replace `var' = `var' * cpi99 * conversion_const1999to2017  if !missing(`var')
	}
					
	foreach var in w2wgs_salary amount_tax_dividends amount_tax_interest social_security schde_profit_loss amount_tax_exempt_inc tax_distr_1 tax_distr_2 {
	replace `var' = 0 if missing(`var') & !missing(aginc)
	}
			
	foreach var in w2wgs total_compensation w2wgs_spouse total_compensation_spouse {
	replace `var' = 0 if missing(`var') & year>2004
	}
	
	gen filer_flag=(!missing(aginc))
	
		* Taxable social-security income 
	
	gen txsocial_security = .
	replace txsocial_security = 0 if social_security == 0
	replace txsocial_security = 0 if (aginc+amount_tax_exempt_inc+social_security*.5)<0
	replace txsocial_security = 0 if (aginc+amount_tax_exempt_inc+social_security*.5-(25000+7000*(filing_status==2)))<0 & txsocial_security==.
	replace txsocial_security = min(min((.5*(min(aginc+amount_tax_exempt_inc+social_security*.5-(25000+7000*(filing_status==2)),9000+3000*(filing_status==2)))),(social_security*.5)) + .85*(aginc+amount_tax_exempt_inc+social_security*.5-(25000+7000*(filing_status==2))-(9000+3000*(filing_status==2))),social_security*.85) if txsocial_security==.
	
		* Taxable pension distributions
	
	gen txpension=(tax_distr_1+tax_distr_2)*(age>59)
	
		* Wage earnings
	
	gen totw2comp=total_compensation 
	
		* Individual total income
	
	gen ptotinc = totw2comp + (aginc - w2wgs_salary - txpension + amount_tax_exempt_inc + (social_security-txsocial_security))  if filer_flag==1 & filing_status==2 & spouse_doc==0
	replace ptotinc = totw2comp + 0.5*(aginc - w2wgs_salary - txpension + amount_tax_exempt_inc + (social_security-txsocial_security))   if filer_flag==1 & filing_status==2 & spouse_doc==1
	replace ptotinc = totw2comp + (aginc - w2wgs_salary - txpension + amount_tax_exempt_inc + (social_security-txsocial_security)) if filer_flag==1 & filing_status!=2 
	replace ptotinc = totw2comp if filer_flag==0
	assert !missing(ptotinc) if year>2004
	
		* AGI percentiles 

	merge m:1 year using "${raw_data}/agi_percentiles/agi_percentiles.dta", keep(master match) nogen keepusing(pctval99) 
	gen top1 = (aginc > pctval99) if filer_flag==1 
	replace top1 = 0 if filer_flag==0
	
		* Individual business income

	gen pftloss = total_money_income - w2wgs_salary - amount_tax_dividends - amount_tax_interest - social_security - schde_profit_loss - tax_distr_1 - tax_distr_2 if filer_flag==1 & filing_status==2 & spouse_doc==0
	replace pftloss = 0.5*(total_money_income - w2wgs_salary - amount_tax_dividends - amount_tax_interest - social_security - schde_profit_loss - tax_distr_1 - tax_distr_2) if filer_flag==1 & filing_status==2 & spouse_doc==1
	replace pftloss = total_money_income - w2wgs_salary - amount_tax_dividends - amount_tax_interest - social_security - schde_profit_loss - tax_distr_1 - tax_distr_2 if filer_flag==1 & filing_status!=2 
	replace pftloss = 0 if filer_flag==0
	assert !missing(pftloss) if year>2004

	gen with25Kbizinc=(!(missing(pftloss)) & pftloss>25000)

		* Other measures

	gen profinc = totw2comp + pftloss if filer_flag==1 
	replace profinc = totw2comp if filer_flag==0
	assert !missing(profinc) if year>2004
	
	* Derived measures of income 
	
	gen logptotinc=log(ptotinc)
	gen logaginc=log(aginc)
	gen hourly_income=ptotinc/(52*wkh) 


	***
	***
	
	* Retirement
	
		* Received at least one 1099SSA form
	
	gen received_1099ssa = (!missing(count1099forms) & count1099forms>0)
	gegen temp = min(year) if received_1099ssa==1 & age >= 40, by(personid)
	gegen retiredyear_1099ssa = max(temp), by(personid)
 	drop temp
 	gen retired_1099ssa = (year >= retiredyear_1099ssa)
	
		* Ratio of all wages to AGI 
	
	gen ratio_wage = w2wgs_salary/total_money_income
	gen drop_ratio_wage = (ratio_wage<0.5) 
	gegen temp = min(year) if drop_ratio_wage==1 & age >= 40, by(personid)
	gegen retiredyear_w2wgsal = max(temp), by(personid)
 	drop temp
 	gen retired_w2wgsal = (year >= retiredyear_w2wgsal)


	***
	***

 	* Add specialty information
	
	gen str11 pr_taxonomy_code = ""
	forvalues varnum = 1(1)15 {
		replace pr_taxonomy_code = prov_taxonomy_code_`varnum' if prov_taxonomy_switch_`varnum'=="Y"
	}

	merge m:1 pr_taxonomy_code using "${intermediate_data}/crosswalks/recoded_specialty_crosswalk.dta", keep(master match) nogen


	***
	***

	* Add medical school information

	fmerge m:1 npi using "${intermediate_data}/physician_compare/physician_compare_cleaned.dta", keep(master match) nogen
	
	gen observe_mdsch=(!missing(med_sch))	
	fmerge m:1 med_sch using "${intermediate_data}/medical_school_ranking/medical_school_ranks.dta", keep(master match) keepusing(newsrank*) nogen
	egen minnewsrank=rowmin(newsrank20*)
	gen ranked_school=(!missing(minnewsrank)) if observe_mdsch==1
	gen top5_school=(minnewsrank<6) if observe_mdsch==1
	drop newsrank*	minnewsrank
	

	***
	***

	* Self-employment

		* Sole proprietor in NPPES (in 2017)
	
	gen se_nppes = (is_sole_proprietor == "Y") if inlist(is_sole_proprietor,"Y","N") & year==2017
	label var se_nppes "Sole Proprietor based on NPPES flag"
		
		* Self employed in ACS (at a point in time)

	gen se_acs = inlist(cow,6,7) if physician_acs == 1 & year==year_survey
	label var se_acs "Self Employed based on ACS (measured in year of survey)"
	
		* Filed shedules C, or E, or SE

	gen se_schedules = max(schedule_c_flag, schedule_e_flag, schd_se)


	***
	***

	* Add ACS variables (note: these are not a panel)

		* Weekly work hours 
	
	replace wkh = . if year != year_survey
	gen wkh_imputed = wkh
	replace wkh_imputed = 0 if mi(wkh) & esr == 6 & year == year_survey
	
		* Number of weeks worked

	gen wkw_imputed = wkw
	replace wkw_imputed = 51 if wkw2 == 1 & missing(wkw) & year == year_survey
	replace wkw_imputed = 48.5 if wkw2 == 2 & missing(wkw) & year == year_survey
	replace wkw_imputed = 43.5 if wkw2 == 3 & missing(wkw) & year == year_survey
	replace wkw_imputed = 33 if wkw2 == 4 & missing(wkw) & year == year_survey
	replace wkw_imputed = 20 if wkw2 == 5 & missing(wkw) & year == year_survey
	replace wkw_imputed = 7 if wkw2 == 6 & missing(wkw) & year == year_survey
	
		* Government employee

	gen govt_acs = inlist(cow,3,4,5) if physician_acs == 1 & year == year_survey
	label var govt_acs "Government Employee based on ACS"
	
		* Flag for being observed in ACS

	gen observed_in_acs=(!missing(year_survey))	
	
		* ACS measure of income

	replace wag = 0 if missing(wag) & year == year_survey
	replace sem = 0 if missing(sem) & year == year_survey
	replace sem_sp = 0 if missing(sem_sp) & year == year_survey
	
	gen income_acs=wag + sem + sem_sp if spouse_doc!=1 & year == year_survey
	replace income_acs=wag + 0.5*(sem + sem_sp) if spouse_doc==1 & year == year_survey
	replace income_acs=. if year != year_survey


	***
	***

	* Firms

	gen countvar = 1

		* Define firms
		* Assign solo proprietor firm id if no EIN, but have C, E, or SE schedules

	gen ein = ein_num if physician_nppes==1
	replace ein = personid if physician_nppes==1 & se_schedules == 1 & mi(ein_num)
	
		* Employer size (physicians only), time-varying

	gegen einsize = sum(countvar) if !missing(ein), by(ein year)
	label var einsize "Size of EIN (physicians only) in a given year"
	
	gen einsize_discrt=1 if einsize==1
	replace einsize_discrt=2 if einsize==2
	replace einsize_discrt=3 if einsize==3
	replace einsize_discrt=4 if einsize==4
	replace einsize_discrt=5 if einsize==5
	replace einsize_discrt=25 if inrange(einsize, 6, 25)
	replace einsize_discrt=45 if inrange(einsize, 26, 45)
	replace einsize_discrt=100 if inrange(einsize, 46, 100)
	replace einsize_discrt=400 if inrange(einsize, 101, 400)
	replace einsize_discrt=500 if einsize>401 & !missing(einsize)

		* Share of physicians in the same Medicare specialty
	
	gegen einsize_by_cmsspec = sum(countvar) if !missing(ein), by(ein year cms_spec_code)
	gen samecmsspec_share=einsize_by_cmsspec/einsize
	label var samecmsspec_share "Share in same CMS specialty in EIN in a given year"

	drop countvar einsize_by_cmsspec 
	

	
	***
	***


	keep  id personid npi source year year_survey stfips cfips cz90 cz00 zipfive ///
	      source physician_nppes physician_acs ///
	      pr_taxonomy_code pr_taxonomy_desc cms_spec_code ///
	      cms_spec_desc aamc_spec_desc spec_category_code spec_category_desc ///
	      nrmp_spec_desc nrmp_spec_code ///
	      provider_enumeration_year last_npi_update_year ///
	      npi_deactivation_year npi_reactivation_year ///
	      ein einsize einsize_discrt samecmsspec_share ///	      
	      grd_yr cred med_sch observe_mdsch ranked_school top5_school ///
	      age age_survey female race birthyr deathyr is_foreign ctzn_stat birthplace_char birthplace_num ///
	      personid_spouse married filing_status spouse_in_labor_force ///
	      npi_spouse spouse_doc ///
	      filer_flag totw2comp ptotinc logptotinc hourly_income ///
	      aginc logaginc top1 pctval* pftloss with25Kbizinc ///
	      profinc  ///	      
	      w2wgs w2wgs_salary amount_tax_dividends amount_tax_interest social_security schde_profit_loss ///
	      amount_tax_exempt_inc total_money_income tax_distr_1 tax_distr_2  ///
	      w2wgs_spouse total_compensation_spouse ///
	      schedule_c_flag schedule_e_flag schd_se ///
	      received_1099ssa retiredyear_1099ssa ///
	      retired_1099ssa retiredyear_w2wgsal retired_w2wgsal ///
	      se_nppes se_acs se_schedules observed_in_acs govt_acs ///
	      income_acs wkh wkh_imputed esr wag sem sem_sp wkw_imputed

	order id personid npi source year year_survey stfips cfips cz90 cz00 zipfive ///
	      source physician_nppes physician_acs ///
	      pr_taxonomy_code pr_taxonomy_desc cms_spec_code ///
	      cms_spec_desc aamc_spec_desc spec_category_code spec_category_desc ///
	      nrmp_spec_desc nrmp_spec_code ///
	      provider_enumeration_year last_npi_update_year ///
	      npi_deactivation_year npi_reactivation_year ///
	      ein einsize einsize_discrt samecmsspec_share ///	      
	      grd_yr cred med_sch observe_mdsch ranked_school top5_school ///
	      age age_survey female race birthyr deathyr is_foreign ctzn_stat birthplace_char birthplace_num ///
	      personid_spouse married filing_status spouse_in_labor_force ///
	      npi_spouse spouse_doc ///
	      filer_flag totw2comp ptotinc logptotinc hourly_income ///
	      aginc logaginc top1 pctval* pftloss with25Kbizinc ///
	      profinc ///	      
	      w2wgs w2wgs_salary amount_tax_dividends amount_tax_interest social_security schde_profit_loss ///
	      amount_tax_exempt_inc total_money_income tax_distr_1 tax_distr_2  ///
	      w2wgs_spouse total_compensation_spouse ///
	      schedule_c_flag schedule_e_flag schd_se ///
	      received_1099ssa retiredyear_1099ssa ///
	      retired_1099ssa retiredyear_w2wgsal retired_w2wgsal ///
	      se_nppes se_acs se_schedules observed_in_acs govt_acs ///
	      income_acs wkh wkh_imputed esr wag sem sem_sp wkw_imputed	      
	      
	gsort id year
	
	keep_analytic_sample
	
	gisid id year
	
	save "${clean_data}/panel_physicians_clean.dta", replace
